{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Transformation with Scikit-Learn In This Notebook\n",
    "In this notebook, we convert raw text into BERT embeddings.  This will allow us to perform natural language processing tasks such as text classification."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![](img/prepare_dataset_bert.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sagemaker\n",
    "import boto3\n",
    "\n",
    "sagemaker_session = sagemaker.Session()\n",
    "role = sagemaker.get_execution_role()\n",
    "bucket = sagemaker_session.default_bucket()\n",
    "region = boto3.Session().region_name\n",
    "\n",
    "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert Raw Text to BERT Features using Hugging Face and TensorFlow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import collections\n",
    "import json\n",
    "import os\n",
    "import pandas as pd\n",
    "import csv\n",
    "from transformers import DistilBertTokenizer\n",
    "\n",
    "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
    "\n",
    "DATA_COLUMN = 'review_body'\n",
    "LABEL_COLUMN = 'star_rating'\n",
    "LABEL_VALUES = [1, 2, 3, 4, 5]\n",
    "\n",
    "label_map = {}\n",
    "for (i, label) in enumerate(LABEL_VALUES):\n",
    "    label_map[label] = i\n",
    "\n",
    "    \n",
    "class InputFeatures(object):\n",
    "  \"\"\"BERT feature vectors.\"\"\"\n",
    "\n",
    "  def __init__(self,\n",
    "               input_ids,\n",
    "               input_mask,\n",
    "               segment_ids,\n",
    "               label_id):\n",
    "    self.input_ids = input_ids\n",
    "    self.input_mask = input_mask\n",
    "    self.segment_ids = segment_ids\n",
    "    self.label_id = label_id\n",
    "    \n",
    "    \n",
    "class Input(object):\n",
    "  \"\"\"A single training/test input for sequence classification.\"\"\"\n",
    "\n",
    "  def __init__(self, text, label=None):\n",
    "    \"\"\"Constructs an Input.\n",
    "    Args:\n",
    "      text: string. The untokenized text of the first sequence. For single\n",
    "        sequence tasks, only this sequence must be specified.\n",
    "      label: (Optional) string. The label of the example. This should be\n",
    "        specified for train and dev examples, but not for test examples.\n",
    "    \"\"\"\n",
    "    self.text = text\n",
    "    self.label = label\n",
    "    \n",
    "\n",
    "def convert_input(text_input, max_seq_length):\n",
    "    # First, we need to preprocess our data so that it matches the data BERT was trained on:\n",
    "    # 1. Lowercase our text (if we're using a BERT lowercase model)\n",
    "    # 2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n",
    "    # 3. Break words into WordPieces (i.e. \"calling\" -> [\"call\", \"##ing\"])\n",
    "    # \n",
    "    # Fortunately, the Transformers tokenizer does this for us!\n",
    "\n",
    "    tokens = tokenizer.tokenize(text_input.text)\n",
    "    print('**tokens**\\n{}\\n'.format(tokens))\n",
    "\n",
    "    encode_plus_tokens = tokenizer.encode_plus(text_input.text,\n",
    "                                               pad_to_max_length=True,\n",
    "                                               max_length=max_seq_length,\n",
    "#                                               truncation=True\n",
    "                                              )\n",
    "\n",
    "    # The id from the pre-trained BERT vocabulary that represents the token.  (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)\n",
    "    input_ids = encode_plus_tokens['input_ids']\n",
    "    \n",
    "    # Specifies which tokens BERT should pay attention to (0 or 1).  Padded `input_ids` will have 0 in each of these vector elements.    \n",
    "    input_mask = encode_plus_tokens['attention_mask']\n",
    "\n",
    "    # Segment ids are always 0 for single-sequence tasks such as text classification.  1 is used for two-sequence tasks such as question/answer and next sentence prediction.\n",
    "    segment_ids = [0] * max_seq_length\n",
    "\n",
    "    # Label for each training row (`star_rating` 1 through 5)\n",
    "    label_id = label_map[text_input.label]\n",
    "\n",
    "    features = InputFeatures(\n",
    "        input_ids=input_ids,\n",
    "        input_mask=input_mask,\n",
    "        segment_ids=segment_ids,\n",
    "        label_id=label_id)\n",
    "\n",
    "    print('**input_ids**\\n{}\\n'.format(features.input_ids))\n",
    "    print('**input_mask**\\n{}\\n'.format(features.input_mask))\n",
    "    print('**segment_ids**\\n{}\\n'.format(features.segment_ids))\n",
    "    print('**label_id**\\n{}\\n'.format(features.label_id))\n",
    "\n",
    "    return features\n",
    "\n",
    "\n",
    "# We'll need to transform our data into a format that BERT understands.\n",
    "# - `text` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. \n",
    "# - `label` is the star_rating label (1, 2, 3, 4, 5) for our training input data\n",
    "def transform_inputs_to_tfrecord(inputs, max_seq_length):\n",
    "    tf_records = []\n",
    "    for (input_idx, text_input) in enumerate(inputs):\n",
    "      if input_idx % 10000 == 0:\n",
    "          print('Writing input {} of {}\\n'.format(input_idx, len(inputs)))\n",
    "\n",
    "      features = convert_input(text_input, max_seq_length)\n",
    "        \n",
    "      all_features = collections.OrderedDict()\n",
    "      all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))\n",
    "      all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))\n",
    "      all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))\n",
    "      all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))\n",
    "\n",
    "      tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))\n",
    "      tf_records.append(tf_record.SerializeToString())\n",
    "\n",
    "    return tf_records\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Three(3) feature vectors are created from each raw review (`review_body`) during the feature engineering phase to prepare for BERT processing:\n",
    "\n",
    "* **`input_ids`**:  The id from the pre-trained BERT vocabulary that represents the token.  (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)\n",
    "    \n",
    "* **`input_mask`**:  Specifies which tokens BERT should pay attention to (0 or 1).  Padded `input_ids` will have 0 in each of these vector elements.\n",
    "\n",
    "* **`segment_ids`**:  Segment ids are always 0 for single-sequence tasks such as text classification.  1 is used for two-sequence tasks such as question/answer and next sentence prediction.\n",
    "\n",
    "And one(1) label is created from each raw review (`star_rating`)  :\n",
    "\n",
    "* **`label_id`**:  Label for each training row (`star_rating` 1 through 5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demonstrate the BERT-specific Feature Engineering Step\n",
    "While we are demonstrating this code with a small amount of data here in the notebook, we will soon scale this to much more data on a powerful SageMaker cluster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "data = [\n",
    "        [5,\"\"\"I needed an antivirus application and know the quality of Norton products.  This was a no brainer for me and I am glad it was so simple to get.\"\"\"],\n",
    "        [3,\"\"\"The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.\"\"\"],\n",
    "        [1,\"\"\"Terrible, none of my codes worked, and I can't uninstall it.  I think this product IS malware and viruses\"\"\"]\n",
    "       ]\n",
    "\n",
    "df = pd.DataFrame(data, columns=['star_rating','review_body'])\n",
    "\n",
    "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n",
    "inputs = df.apply(lambda x: Input(text = x[DATA_COLUMN], \n",
    "                                  label = x[LABEL_COLUMN]), \n",
    "                  axis = 1)\n",
    "\n",
    "max_seq_length = 64\n",
    "tf_records = transform_inputs_to_tfrecord(inputs, max_seq_length)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The three(3) features vectors and one(1) label are converted into a list of `TFRecord` instances (1 per each row of training data):\n",
    "* **`tf_records`**:  Binary representation of each row of training data (3 features + 1 label)\n",
    "\n",
    "These `TFRecord`s are the engineered features that we will use throughout the rest of the pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('**tf_records**')\n",
    "\n",
    "for tf_record in tf_records:\n",
    "    print(tf_record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%javascript\n",
    "Jupyter.notebook.save_checkpoint();\n",
    "Jupyter.notebook.session.delete();"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
